# Stack the female and male LFP columns into one long data frame, tagging each
# row with its gender, so both distributions can be overlaid in a single plot.
lfp <- rbind(
  data.frame(lfp = df$lfp_female, gender = "F"),
  data.frame(lfp = df$lfp_male, gender = "M")
)

# Per-gender means, drawn below as vertical reference lines.
avg_lfp <- mean(df$lfp_female)  # female mean
avg_lfp_m <- mean(df$lfp_male)  # male mean

# Overlaid, semi-transparent histograms of LFP by gender with both means marked.
ggplot(data = lfp, mapping = aes(x = lfp, fill = gender)) +
  geom_histogram(position = "identity", alpha = 0.5) +
  geom_vline(xintercept = avg_lfp_m) +
  geom_vline(xintercept = avg_lfp) +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  labs(title = "LFP by gender in Canada by census tract", x = "LFP (%)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of the male-minus-female LFP gap with its mean marked in red.
# NOTE(review): the mean is read from `df` while the histogram is drawn from
# `df_no_geom`; presumably both hold the same `lfp_gap` values -- confirm.
avg_gap <- mean(df$lfp_gap)
ggplot(data = df_no_geom, mapping = aes(x = lfp_gap)) +
  geom_histogram(binwidth = 2, fill = "white", color = "black") +
  geom_vline(xintercept = avg_gap, color = "red") +
  labs(
    title = "LFP Gap (M-F) in Canada by census tract",
    x = "percentage points"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
\[\begin{aligned} H_0 &: \text{driver}_{F} \geq \text{driver}_M \\ H_1 &: \text{driver}_{F} < \text{driver}_M \end{aligned}\]
# One-sided paired t-test: is the mean paired difference (female - male) in
# `driver` below zero?
# No `var.equal` argument: t.test() only uses it for unpaired two-sample
# tests, so it has no effect when `paired = TRUE`.
t.test(
  commute_modes_female$driver,
  commute_modes_male$driver,
  alternative = "less",
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_modes_female$driver and commute_modes_male$driver
## t = -81.425, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -150.698
## sample estimates:
## mean of the differences
## -153.8055
Reject the null hypothesis; fewer women than men commute as drivers.
\[\begin{aligned} H_0 &: \text{transit}_{F} \leq \text{transit}_M \\ H_1 &: \text{transit}_{F} > \text{transit}_M \end{aligned}\]
# One-sided paired t-test: is the mean paired difference (female - male) in
# `transit` above zero?
# No `var.equal` argument: t.test() only uses it for unpaired two-sample
# tests, so it has no effect when `paired = TRUE`.
t.test(
  commute_modes_female$transit,
  commute_modes_male$transit,
  alternative = "greater",
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_modes_female$transit and commute_modes_male$transit
## t = 48.281, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 49.17677 Inf
## sample estimates:
## mean of the differences
## 50.91152
Reject the null hypothesis; more women than men commute by public transit.
Paired t-tests on commute time: the difference between female and male commuters is statistically significant in every category.
# Two-sided paired t-test of the female - male difference in `t15`.
# No `var.equal` argument: t.test() only uses it for unpaired two-sample
# tests, so it has no effect when `paired = TRUE`.
t.test(
  commute_time_female$t15,
  commute_time_male$t15,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t15 and commute_time_male$t15
## t = 40.692, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 37.10181 40.85763
## sample estimates:
## mean of the differences
## 38.97972
# Two-sided paired t-test of the female - male difference in `t15to29`.
# No `var.equal` argument: t.test() only uses it for unpaired two-sample
# tests, so it has no effect when `paired = TRUE`.
t.test(
  commute_time_female$t15to29,
  commute_time_male$t15to29,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t15to29 and commute_time_male$t15to29
## t = -20.524, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.36934 -19.29425
## sample estimates:
## mean of the differences
## -21.3318
# Two-sided paired t-test of the female - male difference in `t30to44`.
# No `var.equal` argument: t.test() only uses it for unpaired two-sample
# tests, so it has no effect when `paired = TRUE`.
t.test(
  commute_time_female$t30to44,
  commute_time_male$t30to44,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t30to44 and commute_time_male$t30to44
## t = -56.133, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -51.19870 -47.74324
## sample estimates:
## mean of the differences
## -49.47097
# Two-sided paired t-test of the female - male difference in `t45to59`.
# No `var.equal` argument: t.test() only uses it for unpaired two-sample
# tests, so it has no effect when `paired = TRUE`.
t.test(
  commute_time_female$t45to59,
  commute_time_male$t45to59,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t45to59 and commute_time_male$t45to59
## t = -30.318, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -19.02849 -16.71713
## sample estimates:
## mean of the differences
## -17.87281
# Two-sided paired t-test of the female - male difference in `t60`.
# No `var.equal` argument: t.test() only uses it for unpaired two-sample
# tests, so it has no effect when `paired = TRUE`.
t.test(
  commute_time_female$t60,
  commute_time_male$t60,
  paired = TRUE
)
##
## Paired t-test
##
## data: commute_time_female$t60 and commute_time_male$t60
## t = -34.61, df = 5424, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -28.42807 -25.38022
## sample estimates:
## mean of the differences
## -26.90415
# Build the regression frame: the predictor columns named in `iv_colnames`
# plus the response, female LFP.
df_vars <- df_no_geom[iv_colnames]
df_vars$lfp_female <- df_no_geom$lfp_female

# OLS of female LFP on every other column currently in df_vars (the `.`).
model_all <- lm(formula = lfp_female ~ ., data = df_vars)
summary(model_all)
##
## Call:
## lm(formula = lfp_female ~ ., data = df_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.204 -2.433 0.017 2.356 47.163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21.365109 0.816814 -26.157 < 2e-16 ***
## pca1_stock -0.316397 0.039223 -8.067 8.82e-16 ***
## med_hh_income_1000 -0.015140 0.003480 -4.351 1.38e-05 ***
## avg_rooms_per_dwelling 0.252348 0.089990 2.804 0.00506 **
## percent_hh_with_children -0.023203 0.005731 -4.049 5.22e-05 ***
## lfp_male 0.990068 0.008570 115.532 < 2e-16 ***
## percent_drivers_female 0.305795 0.012654 24.166 < 2e-16 ***
## percent_publictransit_female 0.026553 0.003933 6.751 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.948 on 5417 degrees of freedom
## Multiple R-squared: 0.7718, Adjusted R-squared: 0.7715
## F-statistic: 2617 on 7 and 5417 DF, p-value: < 2.2e-16
# Residual diagnostics for model_all.
# Residuals in row order.
plot(residuals(model_all), ylab = "residuals")
# Residuals against the model's fitted values, matching the x-axis label.
# Plotting against the observed response instead would build in a trend,
# because the observed value is the fitted value plus the residual.
plot(
  residuals(model_all) ~ fitted(model_all),
  ylab = "residuals",
  xlab = "fitted value"
)
# Normal Q-Q plot of the residuals with its reference line.
qqnorm(residuals(model_all), main = "Residual QQ Plot (normal dist)")
qqline(residuals(model_all))
bptest(model_all) # heteroskedasticity
##
## studentized Breusch-Pagan test
##
## data: model_all
## BP = 181.45, df = 7, p-value < 2.2e-16
# Refit model_all without pca1_stock and compare the two nested fits.
# This has to run before cma_uid is added to df_vars: the `.` in the formula
# expands to every other column present in the data at fit time.
model_no_sndi <- lm(formula = lfp_female ~ . - pca1_stock, data = df_vars)
anova(model_no_sndi, model_all)

# Mixed model with a random intercept per cma_uid and explicitly listed
# fixed effects.
# NOTE(review): cma_uid is copied from `df`, not `df_no_geom`; presumably the
# rows line up -- confirm.
# NOTE(review): unlike model_all, lfp_male is not among the predictors here --
# confirm this is intended.
df_vars$cma_uid <- df$cma_uid
model_cma <- lmer(
  lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling +
    percent_hh_with_children + percent_drivers_female +
    percent_publictransit_female + (1 | cma_uid),
  data = df_vars
)
summary(model_cma)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling +
## percent_hh_with_children + percent_drivers_female + percent_publictransit_female +
## (1 | cma_uid)
## Data: df_vars
##
## REML criterion at convergence: 36773.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.5525 -0.5566 0.0669 0.6184 5.7482
##
## Random effects:
## Groups Name Variance Std.Dev.
## cma_uid (Intercept) 7.885 2.808
## Residual 50.156 7.082
## Number of obs: 5425, groups: cma_uid, 49
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 4.550e+01 1.220e+00 1.558e+03 37.284 < 2e-16
## pca1_stock -5.107e-01 7.517e-02 5.355e+03 -6.794 1.21e-11
## med_hh_income_1000 1.737e-01 6.473e-03 4.190e+03 26.832 < 2e-16
## avg_rooms_per_dwelling -2.912e+00 1.736e-01 5.033e+03 -16.772 < 2e-16
## percent_hh_with_children 1.047e-01 1.155e-02 5.403e+03 9.060 < 2e-16
## percent_drivers_female 3.686e-01 2.380e-02 5.418e+03 15.489 < 2e-16
## percent_publictransit_female 2.136e-02 7.206e-03 5.418e+03 2.965 0.00304
##
## (Intercept) ***
## pca1_stock ***
## med_hh_income_1000 ***
## avg_rooms_per_dwelling ***
## percent_hh_with_children ***
## percent_drivers_female ***
## percent_publictransit_female **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) pc1_st m___10 avg___ prc___ prcnt_d_
## pca1_stock 0.167
## md_hh__1000 0.190 0.091
## avg_rms_pr_ -0.222 -0.282 -0.677
## prcnt_hh_w_ -0.006 -0.154 -0.013 -0.463
## prcnt_drvr_ -0.728 -0.077 -0.024 -0.224 0.147
## prcnt_pblc_ -0.327 0.020 0.120 -0.014 -0.083 -0.020
# Residual diagnostics for model_cma.
# Residuals in row order.
plot(residuals(model_cma), ylab = "residuals")
# Residuals against the model's fitted values, matching the x-axis label.
# Plotting against the observed response instead would build in a trend,
# because the observed value is the fitted value plus the residual.
plot(
  residuals(model_cma) ~ fitted(model_cma),
  ylab = "residuals",
  xlab = "fitted value"
)
# Normal Q-Q plot of the residuals with its reference line.
qqnorm(residuals(model_cma), main = "Residual QQ Plot (normal dist)")
qqline(residuals(model_cma))
# Response for the gap model: male minus female LFP.
df_vars$lfp_gap <- with(df_vars, lfp_male - lfp_female)

# OLS of the gap on the remaining predictors. The two LFP columns that define
# the gap, and the cma_uid grouping column, are removed from the `.` expansion.
model_gap <- lm(
  formula = lfp_gap ~ . - lfp_female - lfp_male - cma_uid,
  data = df_vars
)
summary(model_gap)
##
## Call:
## lm(formula = lfp_gap ~ . - lfp_female - lfp_male - cma_uid, data = df_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.333 -2.333 -0.028 2.428 17.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.042585 0.570525 38.636 < 2e-16 ***
## pca1_stock 0.314628 0.039194 8.027 1.21e-15 ***
## med_hh_income_1000 0.017064 0.003059 5.579 2.54e-08 ***
## avg_rooms_per_dwelling -0.282646 0.086112 -3.282 0.00104 **
## percent_hh_with_children 0.024156 0.005672 4.259 2.09e-05 ***
## percent_drivers_female -0.305210 0.012644 -24.139 < 2e-16 ***
## percent_publictransit_female -0.026643 0.003933 -6.775 1.38e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.948 on 5418 degrees of freedom
## Multiple R-squared: 0.1459, Adjusted R-squared: 0.1449
## F-statistic: 154.2 on 6 and 5418 DF, p-value: < 2.2e-16
# Residual diagnostics for model_gap.
# Residuals in row order.
plot(residuals(model_gap), ylab = "residuals")
# Residuals against the model's own fitted values, matching the x-axis label.
# The x variable was previously lfp_female, which is neither the fitted value
# nor this model's response (lfp_gap).
plot(
  residuals(model_gap) ~ fitted(model_gap),
  ylab = "residuals",
  xlab = "fitted value"
)
# Normal Q-Q plot of the residuals with its reference line.
qqnorm(residuals(model_gap), main = "Residual QQ Plot (normal dist)")
qqline(residuals(model_gap))
##
## Call:
## lm(formula = lfp_gap ~ . - lfp_male, data = mtl_data_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.478 -2.737 0.170 2.877 13.040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.33066 2.25917 12.540 < 2e-16 ***
## pca1_stock 0.96955 0.20921 4.634 4.67e-06 ***
## med_hh_income_1000 0.03517 0.01864 1.887 0.059811 .
## avg_rooms_per_dwelling -2.61547 0.54491 -4.800 2.15e-06 ***
## percent_hh_with_children 0.21974 0.02809 7.824 3.55e-14 ***
## percent_drivers_female -0.13353 0.03726 -3.584 0.000374 ***
## percent_publictransit_female -0.22254 0.03488 -6.381 4.31e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.47 on 459 degrees of freedom
## Multiple R-squared: 0.2874, Adjusted R-squared: 0.2781
## F-statistic: 30.86 on 6 and 459 DF, p-value: < 2.2e-16
##
## studentized Breusch-Pagan test
##
## data: model_mtl
## BP = 14.015, df = 6, p-value = 0.02947
##
## Call:
## lm(formula = lfp_gap ~ . - lfp_male, data = to_data_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.3863 -2.3234 0.0213 2.2864 13.7066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.190825 1.994000 13.135 < 2e-16 ***
## pca1_stock 0.645885 0.145511 4.439 1.09e-05 ***
## med_hh_income_1000 0.040870 0.009039 4.522 7.50e-06 ***
## avg_rooms_per_dwelling -1.266213 0.273926 -4.622 4.72e-06 ***
## percent_hh_with_children 0.134199 0.018958 7.079 4.39e-12 ***
## percent_drivers_female -0.184561 0.033647 -5.485 6.27e-08 ***
## percent_publictransit_female -0.224325 0.029703 -7.552 1.76e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.861 on 558 degrees of freedom
## Multiple R-squared: 0.2836, Adjusted R-squared: 0.2759
## F-statistic: 36.82 on 6 and 558 DF, p-value: < 2.2e-16